********************************************************************************
**	PURPOSE: Identify potential sources of heterogeneity by comparing terciles of CATEs and terciles of the row variables, 
**			 calculate Q-values and perform LASSO,
**			 create T5, Fig3							
**
**	INPUTS: HTE_ficscore08_numloans_e1_df
**			HTE_ficscore08_numloans_e2_df
**			HTE_ficscore08_numloans_e3_df
**			HTE_scoredf_numloans_e1_df
**			HTE_scoredf_numloans_e2_df
**			HTE_scoredf_numloans_e3_df
**	
**	INTERMEIDATE OUTPUTS: HTE_ficoscore08_e(1/2/3)_df.dta
**			 			  HTE_scoredf_e(1/2/3__df.dta
**						  CausalForest_tercile_Qval_(1/2/3_.dta
**
**	FINAL OUTPUTS: Het_search_ficoscore08_ e(1/2/3)Ter.xlsx 
**			 	  Het_search_scoredf_ e(1/2/3)Ter.xlsx 
**				  Het_search_ficoscore08_ e(1/2/3)Ter_qval.xlsx 
**			 	  Het_search_scoredf_ e(1/2/3)Ter_qval.xlsx 
**				  LASSO(1/2).log
**				  lasso_questions_e(1/2).dta
**				  Combined CATES.png
**				  
**  CREATED/EDITED BY: Kayla Wilding, Leah Kim, Hasan Ahamed
**
**	DATE CREATED: 5/28/2018
**
**	DATE LAST EDITED: 2/28/2023
********************************************************************************

/*******************************************************************************
0. Setup 
********************************************************************************/
clear all

local hetvars age female married adults child race_black college z_risk_i z_contknow_i1 z_liqcf_i1 bfico z_credac_i1 z_credac_rev_i1 prevloans1

local binaryvars female race_black married college 
local contvars age adults child z_risk_i z_contknow_i1 z_liqcf_i1 z_credac_i1 bfico prevloans1 z_credac_rev_i1

set autotabgraphs on

*we would use an ado file inside this do file to create these programs, therefore dropping if it already exists
cap program drop select_questions_lasso_stable choose_stable

/*******************************************************************************
1. Copmare Terciles of CATE or Terciles of Baseline Variables, and Compute Q Values
********************************************************************************/
*Looping through dependent vars and endlines (each causal forest) 
foreach dep in ficoscore08 scoredf { 
	qui forval x = 1/3 {		
		import delimited using "$outputtables/HTE_`dep'_numloans_e`x'_df.csv", clear
		merge 1:1 surveyid using "$adta/Wide with outcomes.dta", keepusing(z_contknow_i1 z_liqcf_i1 z_credac_i1 z_credac_rev_i1 prevloans1 ltca htca mtca flag_randomized flag_matched_scored_base flag_cblgroup flag_extragroup op18 ficoscore08h*) keep(1 3)

		{ //label vars- labels got lost in R
		la var age "Age"
		la var female "Female"
		la var married "Married"
		la var adults "# Adults in HH"
		la var child "# Children in HH"
		la var race_black "Race - Black"
		la var college "1= College or more"
		cap la var bfico 	"Baseline FICO score"
		la var z_risk_i "Financial Risk Taking Scale"
		la var z_contknow_i1 "Self control and knowledge index"
		la var z_liqcf_i1 "Liquidity Index (standardized)"
		la var z_credac_i1 "Installment Credit Access at Baseline (standardized)"
		la var z_credac_rev_i1 "Revolving Credit Access at Baseline (standardized)"
		la var prevloans1 "Previous loans"
		}
			
		{ // Labelling Top/Bottom Terciles
		*Split the CATE variable into terciles 
		xtile ter_`dep' = t1_s1_pre,n(3)
		tab ter_`dep', gen(flag_ter_`dep')
		gen highest_ter = (ter_`dep'==3)
		gen lowest_ter = (ter_`dep'==1)	

		*Split the baseline vars into terciles			
		foreach covar in `hetvars' {
			if "`dep'" == "scoredf" & "`covar'" == "bfico" {
			continue
			}
			xtile `covar'3ile = `covar',n(3)
			gen h_ter`covar' = (`covar'3ile==3)
			gen l_ter`covar' = (`covar'3ile==1)
		}												
		}	

		save "$adta/HTE_`dep'_e`x'_df.dta", replace
		
		** Reg of outcome on each baseline var
		*Mean (SE) of Baseline Variables for Observations in 
		*Terciles of Predicted Treatment Effect (CATE)
		if "`dep'" == "ficoscore08" {	
		** Setting up Results Matrix
		mat results = J(`:word count `hetvars''*2,6,.)
		mat results_q = J(`:word count `hetvars''*2,2,.)
		mat coln results = L_ter_CATE H_ter_CATE P_CATE L_ter_char H_ter_char P_char 
		mat rown results = `hetvars'
		mat coln results_q = pval spec

		loc row_b = 1
		loc row_b2 = 1
		foreach regvar in `hetvars' {
			*Output the mean of the baseline variable by tercile of CATEs
			reg `regvar' lowest_ter highest_ter if inlist(ter_`dep',1,3), nocons
			mat raw = r(table)
			** Coefs and SEs
			mat list raw
			mat results[`row_b',1]=raw[1,1]
			mat results[`row_b',2]=raw[1,2]
			mat results[`row_b'+1,1]=raw[2,1]
			mat results[`row_b'+1,2]=raw[2,2]
			
			* Difference test
			test highest_ter == lowest_ter
				mat results[`row_b',3]=`r(p)'
				mat results_q[`row_b2',1]=`r(p)'
				mat results_q[`row_b2',2]= 1			
			
			*ATE by the tercile of the baseline variables
			if (`:list regvar in binaryvars') {
				reg `dep' enc if `regvar' == 0
				*estimates store low
				mat results[`row_b',4]=_b[enc]
				mat results[`row_b'+1,4]=_se[enc]
				
				reg `dep' enc if `regvar' == 1
				mat results[`row_b',5]=_b[enc]
				mat results[`row_b'+1,5]=_se[enc]
				*estimates store high 
			}

			else if (`:list regvar in contvars') {
				reg `dep' enc if `regvar'3ile == 1
				*estimates store low
				mat results[`row_b',4]=_b[enc]
				mat results[`row_b'+1,4]=_se[enc]
				
				reg `dep' enc if `regvar'3ile == 3
				*estimates store high 
				mat results[`row_b',5]=_b[enc]
				mat results[`row_b'+1,5]=_se[enc]
			}
			
			if (`:list regvar in binaryvars') {
				reg `dep' enc##i.`regvar'
				loc coefeqpval r(table)["pvalue","1.enc#1.`regvar'"]
			}
			else if (`:list regvar in contvars') {
				reg `dep' enc##i.`regvar'3ile
				loc coefeqpval r(table)["pvalue","1.enc#3.`regvar'3ile"]
			}
			
			mat results[`row_b',6]=`coefeqpval'
			mat results_q[`=`row_b2'+`:word count `hetvars''',1]=`coefeqpval'
			mat results_q[`=`row_b2'+`:word count `hetvars''',2]= 2
			loc row_b =`row_b'+2
			loc row_b2 =`row_b2'+1	
			
		} // End Het Loop
		
		*Create Q values 
		{	
		preserve
		clear
		svmat results_q, names(col)
		
		egen varname = repeat(), v(`hetvars')
		gen table_order = _n
		gen group = . 

		la de group 1 "Demographic" 2 "Qualitative Financial Behaviors and Preferences" 3 "Prior Credit History" 	

		replace group = 1 if inlist(varname, "age", "female", "married", "adults", "child", "race_black", "college") & spec ==1					
		replace group = 2 if inlist(varname, "z_risk_i", "z_contknow_i1", "z_liqcf_i1") & spec == 1 
		replace group = 3 if inlist(varname, "bfico", "z_credac_i1", "z_credac_rev_i1", "prevloans1")  & spec == 1  	
		
		replace group = 4 if inlist(varname, "age", "female", "married", "adults", "child", "race_black", "college") & spec == 2	
		replace group = 5 if inlist(varname, "z_risk_i", "z_contknow_i1", "z_liqcf_i1") & spec == 2	
		replace group = 6 if inlist(varname, "bfico", "z_credac_i1", "z_credac_rev_i1", "prevloans1")  & spec == 2
		
		egen var = seq(), to(`:word count `hetvars'')
		sort pval
		bysort group (pval): gen rank = _n		// rank of p values
		bysort group: gen count = _N
		assert !missing(pval)
		gen q = .

		forvalues qval = .999(-.001).000 {	//loop from .999 to .000 in increments of .001 
			* Generate qr/N
			qui gen fdr_temp = `qval' * rank / count
			* Generate binary variable checking condition p(r) <= qr/N
			qui gen reject_temp = (fdr_temp>=pval) if !mi(fdr_temp)
			* Generate variable containing p-value ranks for all p-values that meet above condition
			qui gen reject_rank = reject_temp*rank
			* Record the rank of the largest p-value that meets above condition
			bysort group: egen total_rejected = max(reject_rank)
			* A p-value has been rejected at level q if its rank is less than or equal to the rank of the max p-value that meets the above condition
			replace q = `qval' if rank <= total_rejected & !mi(rank)
			* Remove variables
			qui drop fdr_temp reject_temp reject_rank total_rejected
		}			
		sort table_order	
		save "$adta/CausalForest_tercile_Qval_`x'.dta", replace	
		reshape wide pval rank count q table_order group, i(var) j(spec)
		matrix results2 = results
										
		forval i=1/`:word count `hetvars'' {
		local row_b3 = `i'*2 - 1
		matrix results2[`row_b3',3] = q1[`i']
		matrix results2[`row_b3',6] = q2[`i']
		}
		restore
		
		}
		
		putexcel set "$outputtables/Het_Search_`dep'_ e`x'Ter.xlsx", replace
		putexcel B3 = matrix(results), names nformat(number_d2)
		putexcel B1 = "Terciles: `dep'_numloans_e`x'"
		local rownum : word count `hetvars'
		forval y = 1/`rownum' {
			local varname : word `y' of `hetvars'
			local varlabel : variable label `varname'
			local rownum = `y'*2 + 3
			putexcel A`rownum' = "`varlabel'"
			}
		
		putexcel set "$outputtables/Het_Search_`dep'_ e`x'Ter_qval.xlsx", replace
		putexcel B3 = matrix(results2), names nformat(number_d2)
		putexcel B1 = "Terciles: `dep'_numloans_e`x' with Q values"
		local rownum : word count `hetvars'
		forval y = 1/`rownum' {
			local varname : word `y' of `hetvars'
			local varlabel : variable label `varname'
			local rownum = `y'*2 + 3
			putexcel A`rownum' = "`varlabel'"
			}

		} //Close if ficoscore								
	} //Close forval loop through endlines
} //Close loop through dependent vars 


/*******************************************************************************
2. Conduct LASSO
********************************************************************************/

local hetvars age female married adults child race_black college z_risk_i z_contknow_i1 z_liqcf_i1 bfico z_credac_i1 z_credac_rev_i1 prevloans1

*run the ado file to be able to run select_questions_lasso_stable command
do "$ado/select_questions_lasso_stable.ado"

forval i = 1/2 {
 

use "$adta/HTE_ficoscore08_e`i'_df.dta", clear
*No missing covariates for each CATE

*10-fold CV method
lasso linear t1_s1_pre `hetvars', selection(cv, folds(10)) rseed(1234) 
display e(allvars_sel) // Generate list of variables selected 
lassoknots // This unfortunately does not store the order of variables added to the model -- need to refer to log file
matrix list r(table)

*Lasso stability selection - this an ado file from the paper - "Using machine learning and qualitative interviews to design a five-question survey module for women's agency"
*There should be a folder named xx.ado where you should find this ado file
*syntax: select_questions_lasso_stable [y variable] [list of questions in string][path of output file][# bootstrap samples][subsample size as %][datapath][seed][method][fold]
*Draw N subsamples and run lasso on each subsample. 
*Need to set global output before running select_questions_lasso_stable
global output $outputtables
select_questions_lasso_stable t1_s1_pre "`hetvars'" "$outputtables/lasso_questions_e`i'.dta" 100 50 "$adta/HTE_ficoscore08_e`i'_df.dta" 1234 "cv" 10


}

*reorder variables for easy cell referencing in excel
forval i = 1/2 {
use "$outputtables/lasso_questions_e`i'.dta", clear
gen order = . 
forvalues t = 1/`: word count `hetvars''{
    local var `: word `t' of `hetvars''
	replace order = `t' if strpos(recoding,"`var'") > 0
}
sort order
export excel "$outputtables/lasso_questions_e`i'.xlsx", replace

}

/*******************************************************************************
3. Figure 3 
********************************************************************************/

{
foreach dep in ficoscore08 scoredf {
	forval x = 1/3 {
		use "$adta/HTE_`dep'_e`x'_df.dta", clear
	*Graph the rank-order CATEs
				keep if !mi(enc) 
				sort t1_s1_pred, stable
				gen order = _n
				if "`dep'" == "ficoscore08" {
					local outcome = "FICO Score"
					local ylabel "ylabel(-30(15)15)"
					local let = "B"
				}
				else {
					local outcome = "1 = Scored on FICO"
					local ylabel "ylabel(-.2(.1).2)"
					local let = "A"
				}
				if `x' == 1 {
					local endline = "6 month endline"
				}
				else if `x' == 2 {
					local endline = "12 month endline"
				}
				else {
					local endline = "18 month endline"
				}
				
				
				twoway (scatter t1_s1_pred order) , yline(0) ///
				xtitle("Rank order of observations") ytitle("CATE") ///
				subtitle("`let'`x'. `outcome', `endline'") xlabel(0(250)1500) `ylabel'
				graph save "$gph/`dep'_num_e`x'w.gph", replace

	}
}
cap erase "$gph/Combined CATES.gph"

*combine all the graphs together			
#delimit ;
graph combine "$gph/scoredf_num_e1w.gph" "$gph/scoredf_num_e2w.gph" "$gph/scoredf_num_e3w.gph" "$gph/ficoscore08_num_e1w.gph" "$gph/ficoscore08_num_e2w.gph" "$gph/ficoscore08_num_e3w.gph" , 
				altshrink 
				saving("$gph/Combined CATES.gph") 
				title("Figure 3. CATE plots for each outcome at each endline") 
				subtitle("", span) 
				scheme(s1mono) 
				iscale(1)
				imargin(medsmall);
#delimit cr ;

graph save "$gph/Combined CATES new.gph", replace
graph export "$gph/Combined CATES new.png", replace as(png)
}
	
